// { dg-do compile { target { i?86-*-* x86_64-*-* }  && { ! *-*-solaris* } } }
// { dg-options "-Ofast -funroll-loops -fopenmp -march=knl" }
// Disabling epilogues until we find a better way to deal with scans.
// { dg-additional-options "--param vect-epilogues-nomask=0" }


#include <math.h>

#include <xmmintrin.h>

#define max(a, b)   ( (a) > (b) ? (a) : (b) )

struct Sdata {
  float w; 
  float s;
  float r;
  float t;
  float v;
};
 extern int N1, N2, N3;

#define func(p, up, down) ((p)*(up) + (1.0f-(p)) * (down))
 
void foo (Sdata *in, int idx, float *out)
{
  float* y1 = (float*)_mm_malloc(sizeof(float) * N1,16);
  float* y2  = (float*)_mm_malloc(sizeof(float) * N1,16);
  float* y3  = (float*)_mm_malloc(sizeof(float) * N1,16);
  float* y4  = (float*)_mm_malloc(sizeof(float) * N1,16); 

  for (int k = idx; k < idx + N3; k++) {
    float x1 = in[k].r;
    float x2    = in[k].s;
    float x3      = in[k].w;
    float x4     = in[k].v;
    float x5         = in[k].t;
    x5 /= N2;
    float u = exp(x4 * sqrt(x5));
    float d = exp(-x4 * sqrt(x5));
    float a = exp(x1 * x5);
    float m = exp(-x1 * x5);
    float p = (a - d) / (u - d);
    y2[0] = x2;
    y3[0] = float(1.f);
    for (int i = 1; i <= N2; i++) {
      y2[i] = u * y2[i - 1];
      y3[i] = d * y3[i - 1];
    }
#pragma omp simd
    for (int i = 0; i <= N2; i++) {
      y1[i] =
        max((x3 - y2[N2 - i] * y3[i]), float(0.f));
    }
    for (int i = N2 - 1; i >= 0; i--) {
#pragma omp simd
      for (int j = 0; j <= i; j++) {
        y4[j] = func(p,y1[j],y1[j+1]) * m;
      }
#pragma omp simd
      for (int j = 0; j <= i; j++) {
        float t1 = y2[i - j] * y3[j];
        float t2 = max(x3 - t1, float(0.f));
        y1[j] = max(t2, y4[j]);
      }
    }
    out[k] = y1[0];
  }
  _mm_free(y1);
  _mm_free(y2);
  _mm_free(y3);
  _mm_free(y4);
}

// { dg-final { scan-assembler-not "vmovaps\[^\n\r]*zmm\[0-9]+,\[^\n\r]*zmm\[0-9]+" } }
